
**********
* Readme *
**********

* This script pre-processes the original pnad files:

* [1] import a subset of the original dataset (i.e. all observations, but only variables we'll use);
* [2] set identifiers;
* [3] merge deflators;
* [4] assign appropriate attributes to the variables;
* [5] express monetary variables in constant prices of 15 Jan 2018.

* Note: Original pnad sample size: 

*    date  |   individuals   households 
*  --------+---------------------------
*   2017m3 |     572,379      188,925
*   2017m6 |     568,313      188,275
*   2017m9 |     566,671      187,984
*  2017m12 |     561,288      186,882
*   2018m3 |     560,741      187,443
*   2018m6 |     556,186      187,032
*   2018m9 |     559,761      188,416
*  2018m12 |     554,211      187,383
  
* Note: Constant prices and population totals as of 15 jan 2018.


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
* All input, output and log paths below are built from this global.
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
* Close any log left open by a previous run that stopped before reaching the
* final "log close"; without this, "log using" aborts because a log is
* already open. "capture" makes the line a no-op when no log is open.
capture log close

* Today's date formatted as YYMMDD, used as suffix of the log file name
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/2_3_build_pnad_attributes_${today}.smcl", replace


***************************
* [A] Import stacked data *
***************************

* Import the stacked PNAD file (replaces whatever is in memory)
use "${analysis}/data/1_2_pnad.dta", clear

***********************
* [B] Set identifiers *
***********************

* The key components are concatenated below, so they must be strings first
foreach key_part of varlist UPA V1008 V1014 V2003 Ano Trimestre UF {
	tostring `key_part', replace
}

* Longitudinal identifier - Households
**************************************

* One key per *household*: PSU, house, group.
* Can be used to match households across waves.
gen hh_id = UPA + "-" + V1008 + "-" + V1014
label var hh_id "Household unique identifier"

* Interview identifier (master key)
***********************************

* One key per *interview*: PSU, house, group, order, year, quarter.
* Can be used as master key to merge the basic set with extra questions from
* interview 1 or 5, as well as deflators.
gen int_id = hh_id + "-" + V2003 + "-" + Ano + "-" + Trimestre
label var int_id "Interview unique identifier (master key)"

* The raw components are no longer needed once both keys exist
drop V1008 V1014 V2003

* int_id must provide a unique key (inspect "unique values" in the output)
codebook int_id

* Interview wave
rename V1016 interview_wave
label var interview_wave "Interview wave at given houshold (1 to 5)"

* Longitudinal identifier - Individuals
***************************************

* Unique *individual* identifiers, as proposed in Ribas and Soares (2008)
rename idind ind_id
label var ind_id "Individual unique identifier"
order hh_id ind_id int_id

* Preliminary date variables
****************************

* Each quarter is dated by its last month: 1 -> "03", 2 -> "06", 3 -> "09", 4 -> "12"
gen month = Trimestre
forvalues q = 1/4 {
	replace month = string(3 * `q', "%02.0f") if Trimestre == "`q'"
}

gen date_string = Ano + "-" + month
label var date_string "Interview date (characters)"

* Date MEASURED IN MONTHS (the basic reference unit in all files!):
* parse the year-month string as a daily date, then convert it to a monthly date
gen date = mofd(date(date_string, "YM"))
format %tm date
label var date "Interview date"

***********************
* [C] Merge deflators *
***********************

* Set state_date key (state code and "YYYY-MM" string)
gen state_date = UF + "-" + date_string
label var state_date "State and date"

* Merge the deflators; only observations with a matching deflator are kept
merge m:1 state_date using "${analysis}/data/1_3_aux_pnad_deflator_quarter.dta", nogenerate keep(match)

* Survey attributes
*******************
rename UPA psu_id
label var psu_id "Survey primary sampling unit id"

rename V1027 pre_weight
tostring Estrato, generate(pnad_strata)

* Import strata info
* keep(master match): every survey observation is retained, but strata that
* exist only in the auxiliary file are NOT appended. Without keep(), merge
* adds such using-only rows as observations with all survey variables missing.
sort pnad_strata
merge m:1 pnad_strata using "${analysis}/data/1_3_aux_strata_info.dta", keepusing(strata_id urban state region pop_region) keep(master match) nogenerate
drop Estrato pnad_strata
sort strata_id int_id 

* Poststratification correction (with quarterly pop. projection)
rename V1028 pweight
label var pweight "Survey sampling weight"

* Set time attributes
*********************

* Year and quarter as categorical variables.
* encode numbers the sorted string values 1, 2, ... and keeps them as labels.
encode Ano, generate(year)
label var year "Year when interview took place"

encode Trimestre, generate(quarter)
label var quarter "Quarter when interview took place"

* Year * quarter (dummies), named y_<2-digit year>q<quarter>
destring Ano Trimestre, replace
replace Ano = Ano - 2000
levelsof Ano, local(yy_levels)
levelsof Trimestre, local(q_levels)

foreach yy of local yy_levels {
	foreach q of local q_levels {
		gen byte y_`yy'q`q' = (Ano == `yy' & Trimestre == `q')
	}
}

drop Ano Trimestre

* Labels for the 2017 and 2018 cells
local ord1 "1st"
local ord2 "2nd"
local ord3 "3rd"
local ord4 "4th"

foreach yy in 17 18 {
	forvalues q = 1/4 {
		label var y_`yy'q`q' "20`yy', `ord`q'' quarter"
	}
}

describe y_*

* Year and quarter (categorical): the monthly date itself, with value labels
* attached to the %tm codes of the quarter-end months
gen year_quarter = date

label define year_quarter ///
  686 "2017, 1st quarter" ///
  689 "2017, 2nd quarter" ///
  692 "2017, 3rd quarter" ///
  695 "2017, 4th quarter" ///
  698 "2018, 1st quarter" ///
  701 "2018, 2nd quarter" ///
  704 "2018, 3rd quarter" ///
  707 "2018, 4th quarter" ///
  710 "2019, 1st quarter" ///
  713 "2019, 2nd quarter" ///
  716 "2019, 3rd quarter" ///
  719 "2019, 4th quarter"

label values year_quarter year_quarter

* Set place attributes
**********************

* Region indicators, using the common set for both POF and PNAD.
* The table lists, in creation order, pairs of <state>_<area> and the value of
* the variable "region" that the indicator flags.
* Area suffixes: c = capital, m = metropolitan area, o = remaining areas.
local region_map ///
  RO_c 111 RO_o 113 ///
  AC_c 121 AC_o 123 ///
  AM_c 131 AM_m 132 AM_o 133 ///
  RR_c 141 RR_o 143 ///
  PA_c 151 PA_m 152 PA_o 153 ///
  AP_c 161 AP_m 162 AP_o 163 ///
  TO_c 171 TO_o 173 ///
  MA_c 211 MA_m 212 MA_o 213 ///
  PI_c 221 PI_o 223 ///
  CE_c 231 CE_m 232 CE_o 233 ///
  RN_c 241 RN_m 242 RN_o 243 ///
  PB_c 251 PB_m 252 PB_o 253 ///
  PE_c 261 PE_m 262 PE_o 263 ///
  AL_c 271 AL_m 272 AL_o 273 ///
  SE_c 281 SE_m 282 SE_o 283 ///
  BA_c 291 BA_m 292 BA_o 293 ///
  MG_c 311 MG_m 312 MG_o 313 ///
  ES_c 321 ES_m 322 ES_o 323 ///
  RJ_c 331 RJ_m 332 RJ_o 333 ///
  SP_c 351 SP_m 352 SP_o 353 ///
  PR_c 411 PR_m 412 PR_o 413 ///
  SC_c 421 SC_m 422 SC_o 423 ///
  RS_c 431 RS_m 432 RS_o 433 ///
  MS_c 501 MS_o 503 ///
  MT_c 511 MT_m 512 MT_o 513 ///
  GO_c 521 GO_m 522 GO_o 523 ///
  DF_c 531

* Text used in the variable labels for each area suffix
local area_c "capital"
local area_m "metropolitan area"
local area_o "remaining areas"

* Consume the table two tokens at a time: indicator stub, then region code
local pending `region_map'
while "`pending'" != "" {
	gettoken stub pending : pending
	gettoken code pending : pending

	local uf   = substr("`stub'", 1, 2)
	local area = substr("`stub'", 4, 1)

	gen byte region_`stub' = (region == `code')
	label var region_`stub' "`uf', `area_`area''"
}

**************************************
* [D] Set individual characteristics *
**************************************

* Gender (indicator): 1 when V2007 is 1, 0 otherwise
gen male = (V2007 == 1)
label define male 0 "Female" 1 "Male"
label values male male
label var male "Gender male"

* Race (indicator): 1 when V2010 is 1 or 3, 0 otherwise
gen white = (V2010 == 1 | V2010 == 3)
label define white 0 "Non white" 1 "White"
label values white white
label var white "Race white"

* Race gender interaction (categorical)
*   1 nonwhite female, 2 white female, 3 nonwhite male, 4 white male
* Built as 1 + [white] + 2 * [male]; left missing unless V2010 is one of
* 1, 2, 3, 4, 5, 9 and V2007 is 1 or 2.
gen race_gender = 1 + inlist(V2010, 1, 3) + 2 * (V2007 == 1) ///
  if inlist(V2010, 1, 2, 3, 4, 5, 9) & inlist(V2007, 1, 2)

label define rg 1 "Nonwhite female" 2 "White female" 3 "Nonwhite male"  4 "White male"
label values race_gender rg
label var race_gender "Race and gender" 
tab race_gender, missing

* Race gender interaction (indicators), labelled with the category text
local k = 0
foreach grp in nonwhite_female white_female nonwhite_male white_male {
	local ++k
	local grp_label : label rg `k'
	gen rg_`k'_`grp' = (race_gender == `k')
	label var rg_`k'_`grp' "`grp_label'"
}

* Age (numeric)
rename V2009 years_age
label var years_age "Age, in years"

* Age (categorical); recode also creates the value label "age"
recode years_age (0/24 = 1 "Under 25") (25/34 = 2 "25-34") (35/44 = 3 "35-44") ///
  (45/54 = 4 "45-54") (55/64 = 5 "55-64") (65/150 = 6 "Above 64"), gen(age)

label var age "Age groups"
tab age, missing

* Years of education (numeric)
rename VD3005 years_educ
label var years_educ "Schooling, in years"
tab years_educ, missing

* Highest educational level (categorical)
* VD3004 codes are paired into levels: 1-2 -> 1, 3-4 -> 2, 5-6 -> 3, 7 -> 4
gen educ = ceil(VD3004 / 2) if inlist(VD3004, 1, 2, 3, 4, 5, 6, 7)

label define educ ///
  1 "Less than primary school" ///
  2 "Primary school" ///
  3 "High school" ///
  4 "College or above"

label values educ educ
label var educ "Highest educational level concluded"
tab educ, missing

* Age education interaction (indicators)
tab age educ, missing

* Texts for the variable labels of the age and education dummies
local a1 "Under 25"
local a2 "25-34"
local a3 "35-44"
local a4 "45-54"
local a5 "55-64"
local a6 "Over 64"

local e1 "Less than primary school"
local e2 "Primary school"
local e3 "High school"
local e4 "College or above"

* One dummy per age group
tab age, gen(age_)
forvalues i = 1/6 {
	label var age_`i' "`a`i''"
}
describe age_*

* One dummy per education level
tab educ, gen(educ_)
forvalues j = 1/4 {
	label var educ_`j' "`e`j''"
}
describe educ_*

* One dummy per age group x education level cell
forvalues i = 1/6 {
	forvalues j = 1/4 {
		local e_lower = lower("`e`j''")
		gen byte ae_`i'_`j' = age_`i' * educ_`j'
		label var ae_`i'_`j' "`a`i'', `e_lower'"
	}
}

* Currently attending school (categorical)
* V3003A in 8-11 -> 2, V3003A in 1-7 -> 1, anything else (incl. missing) -> 0
gen attending_school = ///
  cond(inlist(V3003A, 8, 9, 10, 11), 2, ///
  cond(inlist(V3003A, 1, 2, 3, 4, 5, 6, 7), 1, 0))

label define at_school ///
  0 "Not attending school" ///
  1 "Attending school" ///
  2 "Attending college or above"

label values attending_school at_school
label var attending_school "Currently attending school"

tab attending_school, missing

* Currently attending school (indicators)
gen at_school  = (attending_school == 1)
label var at_school  "Attending school"

gen at_college = (attending_school == 2)
label var at_college "Attending college or above"

* Family-related attributes
***************************
sort hh_id ind_id

* Family position (categorical), from the relationship code V2005.
* Members with V2005 in 7-17 are split by age into young / adult / senior.
gen     family_position = 1   if inrange(V2005, 1, 1) // head
replace family_position = 2   if inrange(V2005, 2, 3) // partner
replace family_position = 300 if inrange(V2005, 4, 6) // son or daughter
replace family_position = 401 if inrange(V2005, 7, 17) & years_age < 22 // other young hh members
replace family_position = 402 if inrange(V2005, 7, 17) & years_age > 21 & years_age < 65 // other adult hh members
replace family_position = 403 if inrange(V2005, 7, 17) & years_age > 64 // other senior hh members
replace family_position = 900 if inrange(V2005, 18, 19) // domestic workers and their relatives

label define family_position 1 "Head" 2 "Partner" 300 "Son or daughter" 401 "Other young member" 402 "Other adult member" 403 "Other senior member" 900 "Domestic worker"
label values family_position family_position
label var family_position "Family position"

tab family_position, missing

* Differentiate between "with/no partner" and "with kids/no kids"
*****************************************************************

* Create household-date identifier (unique by household per period)
gen int_id_hh = hh_id + "-" + date_string
sort int_id_hh

* Is there a partner in the family?
* egen returns the NUMBER of partner records in the household, so it is
* collapsed to a 0/1 indicator, exactly as done for has_kids below. Without
* this step a household with two or more partner records gets has_partner > 1:
* its head matches neither "== 0" nor "== 1" in the recoding further down,
* stays coded 1 ("Head") and ends up with every fp_* indicator equal to 0.
bysort int_id_hh: egen has_partner = total(family_position == 2)
replace has_partner = (has_partner != 0)
label define has_partner 1 "with partner" 0 "no partner"
label values has_partner has_partner
label var has_partner "Family with a partner"
tabulate has_partner, missing

* Is there a child in the family?
bysort int_id_hh: egen has_kids = total(family_position == 300)
replace has_kids = (has_kids != 0)
label define has_kids 1 "with child" 0 "no child"
label values has_kids has_kids
label var has_kids "Family with children"
tabulate has_kids, missing

codebook family_position

* Split heads (code 1) by partner/kids and partners (code 2) by kids
replace family_position = 100 if family_position == 1 & has_partner == 0 & has_kids == 0
replace family_position = 101 if family_position == 1 & has_partner == 0 & has_kids == 1

replace family_position = 110 if family_position == 1 & has_partner == 1 & has_kids == 0
replace family_position = 111 if family_position == 1 & has_partner == 1 & has_kids == 1

replace family_position = 210 if family_position == 2 & has_kids == 0
replace family_position = 211 if family_position == 2 & has_kids == 1

label define family_position 100 "Head, no partner, no kids", add
label define family_position 101 "Head, no partner, with kids", add

label define family_position 110 "Head, with partner, no kids", add
label define family_position 111 "Head, with partner, with kids", add

label define family_position 210 "Partner, no kids", add
label define family_position 211 "Partner, with kids", add

codebook family_position

* Household positions as indicators
gen byte fp_h_wp_nk = (family_position == 110)
gen byte fp_h_wp_wk = (family_position == 111)

gen byte fp_h_np_nk = (family_position == 100)
gen byte fp_h_np_wk = (family_position == 101)

gen byte fp_p_nk = (family_position == 210)
gen byte fp_p_wk = (family_position == 211)

gen byte fp_child = (family_position == 300)

gen byte fp_oy = (family_position == 401)
gen byte fp_oa = (family_position == 402)
gen byte fp_os = (family_position == 403)

label var fp_h_np_nk "Head, no partner, no kids"
label var fp_h_np_wk "Head, no partner, with kids"

label var fp_h_wp_nk "Head, with partner, no kids"
label var fp_h_wp_wk "Head, with partner, with kids"

label var fp_p_nk "Partner, no kids"
label var fp_p_wk "Partner, with kids"

label var fp_child "Child"

label var fp_oy "Other young hh member"
label var fp_oa "Other adult hh member"
label var fp_os "Other senior hh member"

describe fp_*

* Other family composition variables
************************************

* All counts are taken within household-period (int_id_hh) and leave out
* records with V2005 equal to 18 or 19 (domestic workers and their relatives).
local member "!inlist(V2005, 18, 19)"

* Family size
bysort int_id_hh: egen family_size = total(`member')
label var family_size "Family size (excl. domestic workers)"

* Kids: younger than 15
bysort int_id_hh: egen n_kids = total(years_age < 15 & `member')
label var n_kids "N. kids (less than 15 years old)"

* Young members: older than 14 and younger than 22
bysort int_id_hh: egen n_youngs = total(years_age > 14 & years_age < 22 & `member')
label var n_youngs "N. young members (15-21)"

* Adult members: older than 21 and younger than 65
bysort int_id_hh: egen n_adults = total(years_age > 21 & years_age < 65 & `member')
label var n_adults "N. adult members (22-64)"

* Senior members: older than 64
bysort int_id_hh: egen n_seniors = total(years_age > 64 & `member')
label var n_seniors "N. elderly members (65+)"

* Work-related variables
************************

* NOTE: Whenever multiple jobs and multiple references are available, we'll prefer the ones referring to the MAIN JOB and the USUAL values.

* Value label shared by the three work-state classifications below
label define work_state ///
  1 "Own-account worker" ///
  2 "Employee" ///
  3 "Employer" ///
  4 "Aux worker" ///
  5 "Unemployed" ///
  6 "Inactive" ///
  7 "Below working age" ///
  8 "Domestic worker"

* Each classification is written as one chain of conditions in priority
* order: the first condition that holds decides the state, and observations
* matching none get an empty string.

* Work states
*************
* VD4008 == 2 (domestic workers) counted as own-account workers;
* VD4008 == 6 (unpaid aux workers) counted as inactive.
gen work_state_name = ///
  cond(years_age < 14,            "Below working age", ///
  cond(VD4008 == 6 | VD4001 == 2, "Inactive", ///
  cond(VD4002 == 2,               "Unemployed", ///
  cond(VD4008 == 4,               "Employer", ///
  cond(inlist(VD4008, 1, 3),      "Employee", ///
  cond(inlist(VD4008, 2, 5),      "Own-account worker", ""))))))

label var work_state_name "Status in the labor market"

* Work states as factor (codes taken from the work_state label defined above)
encode work_state_name, gen(work_state) label(work_state)
label var work_state "Status in the labor market"
tabulate work_state, missing

* Work states (IBGE default)
****************************
* VD4008 == 2 (domestic workers) counted as employees;
* VD4008 == 6 kept as a separate "Aux worker" state.
gen work_state_ibge_name = ///
  cond(years_age < 14,          "Below working age", ///
  cond(VD4001 == 2,             "Inactive", ///
  cond(VD4002 == 2,             "Unemployed", ///
  cond(VD4008 == 6,             "Aux worker", ///
  cond(VD4008 == 4,             "Employer", ///
  cond(inlist(VD4008, 1, 2, 3), "Employee", ///
  cond(VD4008 == 5,             "Own-account worker", "")))))))

label var work_state_ibge_name "Status in the labor market (IBGE default)"

* Work states (IBGE default) as factor
encode work_state_ibge_name, gen(work_state_ibge) label(work_state)
label var work_state_ibge "Status in the labor market (IBGE default)"
tabulate work_state_ibge, missing

* Work states (dom workers as own group)
****************************************
* VD4008 == 2 gets its own "Domestic worker" state;
* VD4008 == 6 (unpaid aux workers) counted as inactive.
gen work_state_dom_name = ///
  cond(years_age < 14,            "Below working age", ///
  cond(VD4008 == 6 | VD4001 == 2, "Inactive", ///
  cond(VD4002 == 2,               "Unemployed", ///
  cond(VD4008 == 4,               "Employer", ///
  cond(VD4008 == 2,               "Domestic worker", ///
  cond(inlist(VD4008, 1, 3),      "Employee", ///
  cond(VD4008 == 5,               "Own-account worker", "")))))))

label var work_state_dom_name "Status in the labor market (domestic workers as own group)"

* Work states (dom workers as own group) as factor
encode work_state_dom_name, gen(work_state_dom) label(work_state)
label var work_state_dom "Status in the labor market (domestic workers as own group)"
tabulate work_state_dom, missing

* Check: cross-tabulate the three classifications
tabulate work_state work_state_ibge
tabulate work_state work_state_dom
tabulate work_state_ibge work_state_dom

* Employment and unemployment duration (midpoint and intervals)
* The two spells follow the same layout: a bracket variable (V4040 / V4076)
* with values 1-4, plus three detail variables named <bracket>1, <bracket>2
* and <bracket>3 that are read for brackets 2, 3 and 4 respectively.
* For each spell three variables are created: lower bound (suffix a),
* point value (no suffix) and upper bound (suffix b). Observations whose
* bracket is not 1-4 are left missing.
local bracket_e V4040
local bracket_u V4076
local spell_e   "Employment"
local spell_u   "Unemployment"

foreach s in e u {
	local b `bracket_`s''

	gen how_long_`s'a = ///
	  cond(`b' == 1, 0, ///
	  cond(`b' == 2, `b'1 - 0.5, ///
	  cond(`b' == 3, `b'2 + 11.5, ///
	  cond(`b' == 4, `b'3 * 12 - 6, .))))

	gen how_long_`s' = ///
	  cond(`b' == 1, 0.25, ///
	  cond(`b' == 2, `b'1, ///
	  cond(`b' == 3, `b'2 + 12, ///
	  cond(`b' == 4, `b'3 * 12, .))))

	gen how_long_`s'b = ///
	  cond(`b' == 1, 0.5, ///
	  cond(`b' == 2, `b'1 + 0.5, ///
	  cond(`b' == 3, `b'2 + 12.5, ///
	  cond(`b' == 4, `b'3 * 12 + 6, .))))

	label var how_long_`s'  "`spell_`s'' duration (in months)"
	label var how_long_`s'a "`spell_`s'' duration range (at least x months)"
	label var how_long_`s'b "`spell_`s'' duration range (up to x months)"

	* The raw bracket and detail variables are not needed afterwards
	drop `b' `b'1 `b'2 `b'3
}

* Job position
**************
rename V4010  occupation 
rename VD4011 occupation_top

label var occupation     "See ocupação COD (V4010)"
label var occupation_top "Cross check top level (VD4011)"

* Job sector
************
rename V4013 sector
label var sector "See atividade CNAE (V4013)"

* Formality status
******************
* Defined only for own-account workers, employees and employers (missing for
* everyone else): 1 when V4019 or V4029 equals 1, 0 otherwise.
gen formal = (V4019 == 1 | V4029 == 1) ///
  if inlist(work_state_name, "Own-account worker", "Employee", "Employer")

label var formal "Formal occupation (carteira assinada or cnpj)"
tab work_state formal, missing

* Workplace (available only after 2018q1)
*****************************************

tab V4020 V4022, missing

label define workplace_lbl ///
	1 "Dedicated store, office" ///
	2 "Place defined by employer or client" ///
	3 "Employer's or client's home" ///
	4 "Worker's house (exclusive area)" ///
	5 "Worker's house (non-exclusive area)" ///
	6 "Worker's vehicle" ///
	7 "Public space" ///
	8 "Other places" // Farming areas, other business, others

* Start at 0 wherever V4020 was answered, then map V4020 / V4022 into codes
* 1-8. An observation stays at 0 only if none of the rules below applies.
gen workplace = 0 if !missing(V4020)
replace workplace = 1 if V4020 == 1 // Dedicated store, office
replace workplace = 8 if V4020 == 2 // Farming areas
replace workplace = 8 if V4022 == 1 // Other business
replace workplace = 2 if V4022 == 2 // Place defined by employer or client
replace workplace = 3 if V4022 == 3 // Employer's or client's home
replace workplace = 4 if V4022 == 4 // Worker's house (exclusive area)
replace workplace = 5 if V4022 == 5 // Worker's house (non-exclusive area)
replace workplace = 6 if V4022 == 6 // Worker's vehicle
replace workplace = 7 if V4022 == 7 // Public space
replace workplace = 8 if V4022 == 8 // Others

replace workplace = 3 if VD4008 == 2 & year_quarter >= 698 // domestic workers (they are not asked this question)

label values workplace workplace_lbl
label var workplace "Usual workplace"

* Check
* NOTE(review): "year == 2" relies on encode having numbered the second
* sorted value of Ano as 2 (presumably 2018) - confirm.
tab workplace year, missing
tab workplace work_state if year == 2, missing
tab workplace work_state_ibge if year == 2, missing

* Dummies
* Built explicitly so that wp_k always flags workplace code k. With
* "tab workplace, gen(wp_)" the dummies are numbered by the rank of the values
* actually observed, so a leftover 0 or an absent category would shift the
* numbering and the labels below would describe the wrong categories.
* As with tab, gen(), the dummies are missing where workplace is missing.
forvalues k = 1/8 {
	gen byte wp_`k' = (workplace == `k') if !missing(workplace)
}

label var wp_1 "Dedicated store, office"
label var wp_2 "Place defined by employer or client"
label var wp_3 "Employer's or client's home"
label var wp_4 "Worker's house (exclusive area)"
label var wp_5 "Worker's house (non-exclusive area)"
label var wp_6 "Worker's vehicle"
label var wp_7 "Public space"
label var wp_8 "Other places"

*********************************
* [E] Adjust monetary variables *
*********************************

* Monthly earnings from main job (usual values) times the merged deflator
gen winc = deflator_habitual * VD4016
label var winc "Work income, main job, monthly"

* Log work income, overall and restricted to each group of workers
gen ln_winc = log(winc)
label var ln_winc "Log work income"

gen ln_winc_oaw      = ln_winc if work_state_name == "Own-account worker"
gen ln_winc_employee = ln_winc if work_state_name == "Employee"
gen ln_winc_employer = ln_winc if work_state_name == "Employer"

label var ln_winc_oaw      "Own-account workers' log work income"
label var ln_winc_employee "Employees' log work income"
label var ln_winc_employer "Employers' log work income"

* Working hours: V4039 scaled by 4 (V4039 is presumably weekly hours - confirm)
gen main_job_work_hours = 4 * V4039
label var main_job_work_hours "Monthly work hours"
drop V4039 VD4031

* Full time: more than 100 monthly hours; missing when hours are missing
gen full_time = (main_job_work_hours > 100) if !missing(main_job_work_hours)
label var full_time "Over 100 monthly hours in the main job" 

* Wrap up and export files
**************************
* Writes the full processed sample to 2_3_pnad_all.dta.

* Constant tag identifying the data source
gen survey = "pnad"
label var survey "Data source"

* Variables written to disk; the wildcards in the last row pick up every
* indicator sharing that prefix. The same list also fixes the column order.
global export_set ///
  survey state region strata_id urban psu_id pweight hh_id ind_id int_id work_state work_state_name work_state_ibge work_state_ibge_name work_state_dom work_state_dom_name ///
  year quarter year_quarter state_date date date_string interview_wave ///
  family_position male white race_gender years_age age years_educ educ attending_school ///
  family_size n_kids n_youngs n_adults n_seniors ///
  main_job_work_hours full_time occupation occupation_top sector formal workplace ///
  winc ln_winc ln_winc_oaw ln_winc_employee ln_winc_employer ///
  how_long_ua how_long_u how_long_ub how_long_ea how_long_e how_long_eb ///
  rg_* fp_* age_* educ_* ae_* at_* wp_* region_*

* Keep only the export set, put it in list order, sort, and remove rows that
* are exact duplicates on all remaining variables
keep $export_set
order $export_set
sort strata_id int_id
duplicates drop
compress
describe

* How many households?
* NOTE(review): "unique" is not a built-in Stata command as far as I know -
* presumably the user-written SSC package; confirm it is installed. The
* variable _Unique dropped below is presumably created by its by() option.
unique hh_id, by(date) detail
drop _Unique

* How many individuals?
codebook ind_id

* Which periods are covered by the data and how many observation per period?
tabulate date

* Export
save "${analysis}/data/2_3_pnad_all.dta", replace

******************************************
* Subset to urban working age population *
******************************************

* Reload the full file and declare the survey design
* (PSUs, sampling weights, strata; single-PSU strata are centered)
use "${analysis}/data/2_3_pnad_all.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered)

* Share of urban observations, weighted and unweighted
svy: mean urban
mean urban

* Remove rural areas
keep if urban != 0

* Share outside the 14-64 age range in the urban sample, weighted and unweighted
gen outside_work_age = !inrange(years_age, 14, 64)
svy: mean outside_work_age
mean outside_work_age

* Remove observations below working age
drop if years_age < 14

* The first age group now starts at 14: update its value label and the
* labels of the matching age x education indicators
label define age 1 "14-24", modify

local e1 "less than primary school"
local e2 "primary school"
local e3 "high school"
local e4 "college or above"

forvalues j = 1/4 {
	label var ae_1_`j' "14-24, `e`j''"
}

* Remove observations above working age (missing ages are kept)
keep if years_age <= 64 | missing(years_age)

* Indicators that referred to the removed 65+ group
drop age_6 fp_os ae_6_1 ae_6_2 ae_6_3 ae_6_4

* Drop domestic workers living at their client's house
* (family_position code 900, labelled "Domestic worker")
gen domestic = (family_position == 900)
svy: mean domestic
mean domestic
drop if domestic == 1
drop domestic

*************************
* Winsorize work income *
*************************

* Percentiles are weighted with the sampling weight rounded to an integer,
* passed to summarize as a frequency weight
gen fweight = round(pweight)

* Winsorise available work income and log available work income:
* values below the weighted 1st percentile are raised to it and values above
* the weighted 99th percentile are lowered to it. Each variable uses its own
* bounds, stored in scalars <variable>_low_bound and <variable>_up_bound.
foreach v in winc ln_winc {
	sum `v' [fweight = fweight], detail

	scalar `v'_low_bound = r(p1)
	scalar `v'_up_bound  = r(p99)

	replace `v' = min(max(`v', scalar(`v'_low_bound)), scalar(`v'_up_bound)) if !missing(`v')

	sum `v' [fweight = fweight], detail
}

* Refresh the group-specific log incomes from the winsorised ln_winc,
* summarising each one before and after
local state_oaw      "Own-account worker"
local state_employee "Employee"
local state_employer "Employer"

foreach g in oaw employee employer {
	sum     ln_winc_`g' [fweight = fweight], detail
	replace ln_winc_`g' = ln_winc if work_state_name == "`state_`g''"
	sum     ln_winc_`g' [fweight = fweight], detail
}

drop fweight

* Export sub sample
*******************
* Writes the restricted sample currently in memory to 2_3_pnad_clean.dta.

* Save only the covariates we'll use for the estimations
* (this redefines the global export_set; any variable in memory that is not
* listed here is dropped by the keep below)
global export_set ///
  survey state region strata_id psu_id pweight hh_id ind_id int_id ///
  date date_string interview_wave ///
  family_position male white race_gender years_age age years_educ educ attending_school ///
  family_size n_kids n_youngs n_adults n_seniors ///
  work_state work_state_name work_state_ibge work_state_ibge_name work_state_dom work_state_dom_name main_job_work_hours full_time occupation occupation_top sector formal workplace ///
  winc ln_winc ln_winc_oaw ln_winc_employee ln_winc_employer ///
  how_long_ea how_long_e how_long_eb how_long_ua how_long_u how_long_ub ///
  rg_* fp_* age_* educ_* ae_* at_* wp_* region_* 

* Keep only the export set, put it in list order, sort, and remove rows that
* are exact duplicates on all remaining variables
keep $export_set
order $export_set
sort strata_id int_id
duplicates drop
compress
describe

* How many households?
* NOTE(review): "unique" is not a built-in Stata command as far as I know -
* presumably the user-written SSC package; confirm it is installed. The
* variable _Unique dropped below is presumably created by its by() option.
unique hh_id, by(date) detail
drop _Unique

* How many individuals?
codebook ind_id

* Which periods are covered by the data and how many observation per period?
tabulate date

* Export the data
save "${analysis}/data/2_3_pnad_clean.dta", replace


* End of script
***************
* capture: do not fail if no log is open at this point
cap log close